library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Warning: package 'car' was built under R version 3.6.3
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(pROC)
## Warning: package 'pROC' was built under R version 3.6.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 3.6.3
## ResourceSelection 0.3-5 2019-07-22
soccer <- read.csv("spi_matches.csv")
str(soccer)
## 'data.frame': 34580 obs. of 22 variables:
## $ date : Factor w/ 1330 levels "2016-08-12","2016-08-13",..: 1 1 2 2 2 2 2 2 2 2 ...
## $ league_id : int 1843 1843 2411 2411 2411 2411 2411 2411 1843 2411 ...
## $ league : Factor w/ 37 levels "Argentina Primera Division",..: 13 13 4 4 4 4 4 4 13 4 ...
## $ team1 : Factor w/ 758 levels "1. FC Heidenheim 1846",..: 80 50 340 186 222 438 124 631 106 425 ...
## $ team2 : Factor w/ 758 levels "1. FC Heidenheim 1846",..: 510 314 395 742 687 655 665 739 643 658 ...
## $ spi1 : num 51.2 68.8 53.6 55.2 68 ...
## $ spi2 : num 85.7 56.5 66.8 58.7 73.2 ...
## $ prob1 : num 0.0463 0.5714 0.3459 0.4214 0.391 ...
## $ prob2 : num 0.838 0.167 0.362 0.294 0.34 ...
## $ probtie : num 0.116 0.262 0.292 0.285 0.269 ...
## $ proj_score1: num 0.91 1.82 1.16 1.35 1.47 1.3 1.37 1.91 1.39 2.69 ...
## $ proj_score2: num 2.36 0.86 1.24 1.14 1.38 1.01 1.05 1.05 1.14 0.48 ...
## $ importance1: num 32.4 53.7 38.1 43.6 31.9 33.9 36.5 34.1 37.9 73 ...
## $ importance2: num 67.7 22.9 22.2 34.6 48 32.5 29.1 30.7 44.2 27 ...
## $ score1 : int 0 2 2 0 1 1 0 1 3 2 ...
## $ score2 : int 1 2 1 1 1 1 1 1 2 1 ...
## $ xg1 : num 0.97 2.45 0.85 1.11 0.73 1.4 1.24 1.05 1.03 2.14 ...
## $ xg2 : num 0.63 0.77 2.77 0.68 1.11 0.55 1.84 0.22 1.84 1.25 ...
## $ nsxg1 : num 0.43 1.75 0.17 0.84 0.88 1.13 1.71 1.52 1.1 1.81 ...
## $ nsxg2 : num 0.45 0.42 1.25 1.6 1.81 1.06 1.56 0.41 2.26 0.92 ...
## $ adj_score1 : num 0 2.1 2.1 0 1.05 1.05 0 1.05 3.12 2.1 ...
## $ adj_score2 : num 1.05 2.1 1.05 1.05 1.05 1.05 1.05 1.05 2.1 1.05 ...
Questions Raised:
How do the SPI’s of top leagues compare?
At what level do international competitions play at?
How is a single league distributed in terms of SPI?
Challenge: Format of rows is not useful for us
Solution: We must transform
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
soccer$date <- ymd(soccer$date)
Lubridate can be used to transform the date column from a factor object to a date object
library(tidyr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
id <- rownames(soccer)
soccer <- cbind(id = id, soccer)
names(soccer)[1] <- "match_id"
soccer <-
unique(soccer %>% pivot_longer(team1:team2, names_to = "HoA", values_to = "team"))
soccer$HoA[soccer$HoA == "team1"] <- "home"
soccer$HoA[soccer$HoA == "team2"] <- "away"
soccer$spi <- soccer$spi2
soccer$spiOpp <- soccer$spi1
soccer$spi[soccer$HoA == "home"] <-
soccer$spi1[soccer$HoA == "home"]
soccer$spiOpp[soccer$HoA == "home"] <-
soccer$spi2[soccer$HoA == "away"]
soccer <- soccer %>% select(-spi1,-spi2)
soccer$probWin <- soccer$prob1
soccer$probLoss <- soccer$prob2
soccer$probWin[soccer$HoA == "away"] <-
soccer$prob2[soccer$HoA == "away"]
soccer$probLoss[soccer$HoA == "away"] <-
soccer$prob1[soccer$HoA == "home"]
soccer <- soccer %>% select(-prob1,-prob2)
soccer$score <- soccer$score1
soccer$scoreOpp <- soccer$score2
soccer$score[soccer$HoA == "away"] <-
soccer$score2[soccer$HoA == "away"]
soccer$scoreOpp[soccer$HoA == "away"] <-
soccer$score1[soccer$HoA == "home"]
soccer <- soccer %>% select(-score1,-score2)
soccer$proj_score <- soccer$proj_score1
soccer$proj_scoreOpp <- soccer$proj_score2
soccer$proj_score[soccer$HoA == "away"] <-
soccer$proj_score2[soccer$HoA == "away"]
soccer$proj_scoreOpp[soccer$HoA == "away"] <-
soccer$proj_score1[soccer$HoA == "home"]
soccer <- soccer %>% select(-proj_score1,-proj_score2)
soccer$importance <- soccer$importance1
soccer$importanceOpp <- soccer$importance2
soccer$importance[soccer$HoA == "away"] <-
soccer$importance2[soccer$HoA == "away"]
soccer$importanceOpp[soccer$HoA == "away"] <-
soccer$importance1[soccer$HoA == "home"]
soccer <- soccer %>% select(-importance1,-importance2)
soccer$xg <- soccer$xg1
soccer$xgOpp <- soccer$xg2
soccer$xg[soccer$HoA == "away"] <- soccer$xg2[soccer$HoA == "away"]
soccer$xgOpp[soccer$HoA == "away"] <-
soccer$xg1[soccer$HoA == "home"]
soccer <- soccer %>% select(-xg1,-xg2)
soccer$nsxg <- soccer$nsxg1
soccer$nsxgOpp <- soccer$nsxg2
soccer$nsxg[soccer$HoA == "away"] <-
soccer$nsxg2[soccer$HoA == "away"]
soccer$nsxgOpp[soccer$HoA == "away"] <-
soccer$nsxg1[soccer$HoA == "home"]
soccer <- soccer %>% select(-nsxg1,-nsxg2)
soccer$IntComp <- FALSE
soccer$IntComp[soccer$league %in% c("UEFA Champions League", "UEFA Europa League")] <-
TRUE
soccer <- soccer %>% select(-adj_score1, -adj_score2)
str(soccer)
## Classes 'tbl_df', 'tbl' and 'data.frame': 69160 obs. of 22 variables:
## $ match_id : Factor w/ 34580 levels "1","10","100",..: 1 1 11112 11112 22223 22223 27915 27915 29026 29026 ...
## $ date : Date, format: "2016-08-12" "2016-08-12" ...
## $ league_id : int 1843 1843 1843 1843 2411 2411 2411 2411 2411 2411 ...
## $ league : Factor w/ 37 levels "Argentina Primera Division",..: 13 13 13 13 4 4 4 4 4 4 ...
## $ probtie : num 0.116 0.116 0.262 0.262 0.292 ...
## $ HoA : chr "home" "away" "home" "away" ...
## $ team : Factor w/ 758 levels "1. FC Heidenheim 1846",..: 80 510 50 314 340 395 186 742 222 687 ...
## $ spi : num 51.2 85.7 68.8 56.5 53.6 ...
## $ spiOpp : num 85.7 51.2 56.5 68.8 66.8 ...
## $ probWin : num 0.0463 0.838 0.5714 0.1669 0.3459 ...
## $ probLoss : num 0.838 0.0463 0.1669 0.5714 0.3621 ...
## $ score : int 0 1 2 2 2 1 0 1 1 1 ...
## $ scoreOpp : int 1 0 2 2 1 2 1 0 1 1 ...
## $ proj_score : num 0.91 2.36 1.82 0.86 1.16 1.24 1.35 1.14 1.47 1.38 ...
## $ proj_scoreOpp: num 2.36 0.91 0.86 1.82 1.24 1.16 1.14 1.35 1.38 1.47 ...
## $ importance : num 32.4 67.7 53.7 22.9 38.1 22.2 43.6 34.6 31.9 48 ...
## $ importanceOpp: num 67.7 32.4 22.9 53.7 22.2 38.1 34.6 43.6 48 31.9 ...
## $ xg : num 0.97 0.63 2.45 0.77 0.85 2.77 1.11 0.68 0.73 1.11 ...
## $ xgOpp : num 0.63 0.97 0.77 2.45 2.77 0.85 0.68 1.11 1.11 0.73 ...
## $ nsxg : num 0.43 0.45 1.75 0.42 0.17 1.25 0.84 1.6 0.88 1.81 ...
## $ nsxgOpp : num 0.45 0.43 0.42 1.75 1.25 0.17 1.6 0.84 1.81 0.88 ...
## $ IntComp : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
After using tools like cbind and pivot_longer, this is a much more flexible dataframe for our use
soccer$YM <- format(soccer$date, "%Y-%m")
leagueComp <-
soccer %>% group_by(league, YM) %>% summarize(spi = mean(spi)) %>% filter(
league %in% c(
"Barclays Premier League",
"French Ligue 1",
"German Bundesliga",
"Italy Serie A",
"Spanish Primera Division"
)
)
leagueComp %>% ggplot(aes(x = YM, y = spi, color = league)) + geom_point() + geom_line(aes(group = league)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(
title = "Average SPI of Top 5 European Leagues by Month",
x = "Year-Month",
y = "SPI" ,
color = "League",
subtitle = "Excluding UEFA Competitions"
)
This plot shows how the Barclays Premier League has climbed to the top the past few seasons and proven itself as one of the best leagues. It also shows the dominance of last seasons spanish teams.
leagueComp <-
soccer %>% group_by(league, YM) %>% summarize(spi = mean(spi)) %>% filter(
league %in% c(
"Barclays Premier League",
"French Ligue 1",
"German Bundesliga",
"Italy Serie A",
"Spanish Primera Division",
"UEFA Champions League",
"UEFA Europa League"
)
)
leagueComp %>% ggplot(aes(x = YM, y = spi, color = league)) + geom_point() + geom_line(aes(group = league)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + labs(
title = "Average SPI of Top 5 European Leagues by Month",
x = "Year-Month",
y = "SPI" ,
color = "League",
subtitle = "Including UEFA Competitions"
)
With the inclusion of international leagues, which are made up of top teams from leagues around Europe, we can see the nature of knockout tournaments. Weaker teams are phased out as the stronger teams win. Then the season resets with a wider variety of teams.
PL1920 <-
soccer %>% filter(
league == "Barclays Premier League",
date > as.Date("2019-06-01"),
date < as.Date("2020-03-13")
)
p <-
PL1920 %>% ggplot(aes(x = date, y = spi, color = team)) + geom_point() + geom_line(aes(group = team)) + labs(
title = "SPI by Team in the BPL (2019/2020 Season)",
x = "Date",
y = "SPI" ,
color = "Team"
)
ggplotly(p)
This plot shows the SPI’s for all 20 teams in the BPL. It is hard to discern much information from this as it is very crowded, however we can see thar there are two dominant teams, Manchester City and Liverpool. There is a wide spread in the midfield and a very narrow spread toward the bottom, with one weak standout, Norwich City.
PL1920$Top6 <- FALSE
PL1920$Top6[PL1920$team %in% c(
"Arsenal",
"Chelsea",
"Liverpool",
"Manchester City",
"Manchester United",
"Tottenham Hotspur"
)] <- TRUE
pTop6 <-
PL1920 %>% ggplot(aes(
x = date,
y = spi,
color = Top6,
label = team
)) + geom_point() + geom_line(aes(group = team)) + labs(
title = "SPI by Team in the BPL (2019/2020 Season)",
x = "Date",
y = "SPI" ,
color = "Big 6 Team"
)
ggplotly(pTop6)
Besides Man. City, Liverpool, and parts of Chelsea’s season, the rest of the top four are not much stronger than some midtable teams. Perhaps these teams should not be as highly regarded. Still fairly difficult to make any claims about these teams, since it is still crowded.
top6Only <-
PL1920 %>% filter(Top6 == TRUE) %>% ggplot(aes(x = date, y = spi, color = team)) + geom_point() + geom_line(aes(group = team)) + geom_smooth() + labs(
title = "SPI of Big 6 Teams in the BPL (2019/2020 Season)",
x = "Date",
y = "SPI" ,
color = "Team"
)
ggplotly(top6Only)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
After Isolating the top 6 teams and adding a smooth layer, we can better see the trajectories of these teams. Notable features are Tottenham and Arsenal’s  drop in form throughout the season.
PL1920$NewPro <- FALSE
PL1920$NewPro[PL1920$team %in% c("Aston Villa", "Norwich City", "Sheffield United")] <-
TRUE
pNewPro <-
PL1920 %>% filter(NewPro == FALSE) %>% ggplot(aes(
x = date,
y = spi,
label = team,
color = "Other"
)) + geom_point() + geom_line(aes(group = team, color = "Other")) + geom_point(data = PL1920 %>% filter(NewPro == TRUE), aes(x = date, y = spi, color = team)) + geom_line(data = PL1920 %>% filter(NewPro == TRUE),
aes(
x = date,
y = spi,
color = team,
group = team
)) + geom_smooth(data = PL1920 %>% filter(NewPro == TRUE), aes(x = date, y = spi, color = team)) + labs(
title = "SPI of Newly Promoted Teams in the BPL (2019/2020 Season)",
x = "Date",
y = "SPI" ,
color = "Team"
) + scale_color_discrete(breaks = c("Aston Villa", "Norwich City", "Sheffield United", "Other"))
ggplotly(pNewPro)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
There are many english football teams and the governing body does not want them all in one league so they use a tiered system where teams change leagues at the end of the season based on performance
If we monitor the strength of newly promoted teams, we can see that Sheffield United has had an excellent season in the PL, surpassing the strength of many teams. Additionally, Aston Villa had a promising start but their form fell in the second half of the season. Lastly, Norwich has not proven that the belong in the top flight.
Question: What variables help to predict Win?
Challenge: Cleaning and encoding variables to work with this model.
soccerModelData <- na.omit(soccer)
soccerModelData$win <- FALSE
soccerModelData$spiDiff <- 101
soccerModelData$impDiff <- 101
for (x in 1:nrow(soccerModelData)) {
if (soccerModelData$score[x] - soccerModelData$scoreOpp[x] > 0) {
soccerModelData$win[x] <- 1
} else{
soccerModelData$win[x] = 0
}
if (soccerModelData$HoA[x] == "home") {
soccerModelData$HoA[x] = 1
} else{
soccerModelData$HoA[x] = 0
}
if (soccerModelData$IntComp[x] == TRUE) {
soccerModelData$IntComp[x] = 1
} else{
soccerModelData$IntComp[x] = 0
}
soccerModelData$spiDiff[x] <-
soccerModelData$spi[x] - soccerModelData$spiOpp[x]
soccerModelData$impDiff[x] <-
soccerModelData$importance[x] - soccerModelData$importanceOpp[x]
}
WinModel <-
glm(win ~ spiDiff + impDiff + HoA + IntComp,
data = soccerModelData,
family = binomial(link = logit))
summary(WinModel)
##
## Call:
## glm(formula = win ~ spiDiff + impDiff + HoA + IntComp, family = binomial(link = logit),
## data = soccerModelData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1349 -0.9179 -0.6307 1.1111 2.5963
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0443422 0.0197987 -52.748 < 2e-16 ***
## spiDiff 0.0547615 0.0011191 48.936 < 2e-16 ***
## impDiff 0.0027692 0.0004776 5.798 6.7e-09 ***
## HoA1 0.8744272 0.0260927 33.512 < 2e-16 ***
## IntComp 0.0261310 0.0562287 0.465 0.642
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 39532 on 29917 degrees of freedom
## Residual deviance: 35098 on 29913 degrees of freedom
## AIC: 35108
##
## Number of Fisher Scoring iterations: 4
Here we create a logistic model to predict a win based on the differnce between the two teams on both SPI and importance as well as factors of home or away and international competition.
emptyModel <-
glm(win ~ 1, data = soccerModelData, family = binomial(link = logit))
anova(emptyModel, WinModel, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: win ~ 1
## Model 2: win ~ spiDiff + impDiff + HoA + IntComp
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 29917 39532
## 2 29913 35098 4 4433.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This anova output tells us that this model helps us predict a result!
AICModel <- step(WinModel)
## Start: AIC=35108.14
## win ~ spiDiff + impDiff + HoA + IntComp
##
## Df Deviance AIC
## - IntComp 1 35098 35106
## <none> 35098 35108
## - impDiff 1 35132 35140
## - HoA 1 36258 36266
## - spiDiff 1 37894 37902
##
## Step: AIC=35106.35
## win ~ spiDiff + impDiff + HoA
##
## Df Deviance AIC
## <none> 35098 35106
## - impDiff 1 35132 35138
## - HoA 1 36258 36264
## - spiDiff 1 37895 37901
summary(AICModel)
##
## Call:
## glm(formula = win ~ spiDiff + impDiff + HoA, family = binomial(link = logit),
## data = soccerModelData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1327 -0.9178 -0.6307 1.1110 2.5958
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0428910 0.0195481 -53.350 < 2e-16 ***
## spiDiff 0.0547714 0.0011189 48.949 < 2e-16 ***
## impDiff 0.0027679 0.0004776 5.796 6.81e-09 ***
## HoA1 0.8743993 0.0260922 33.512 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 39532 on 29917 degrees of freedom
## Residual deviance: 35098 on 29914 degrees of freedom
## AIC: 35106
##
## Number of Fisher Scoring iterations: 4
This function selects a better model by eliminating unhelpful predictors. It eliminates the international competition factor therefore we do not have reason to believe that international competitions impact results.
confusion.glm <- function(model, cutoff = 0.5) {
predicted <- ifelse(predict(model, type = 'response') > cutoff, 1, 0)
observed <- model$y
confusion <- table(observed, predicted)
agreement <- (confusion[1, 1] + confusion[2, 2]) / sum(confusion)
specificity <- confusion[1, 1] / rowSums(confusion)[1]
sensitivity <- confusion[2, 2] / rowSums(confusion)[2]
list(
"Confusion Table" = confusion,
"Agreement" = agreement,
"Sensitivity" = sensitivity,
"Specificity" = specificity
)
}
confusion.glm(AICModel)
## $`Confusion Table`
## predicted
## observed 0 1
## 0 16019 2732
## 1 6610 4557
##
## $Agreement
## [1] 0.6877465
##
## $Sensitivity
## 1
## 0.4080774
##
## $Specificity
## 0
## 0.8543011
This function shows us how well our model can predict results. It ends up with just under 69% accuracy.
We can conclude that the difference in SPI and importnace of the match between teams and the home or away factor helps predict the match result.